### Replication Material ###
### The Manifesto Corpus ### 
### Comparison of Manifesto Drafts and Enacted Versions - Figures 2 and 3 #####

library(manifestoR)
library(openNLP)
library(openNLPmodels.de)
library(NLP)
library(stringi)
library(stringr)
library(parallel)
library(ggplot2)
library(dplyr)


# set api key
mp_setapikey(key.file = "manifesto_apikey.txt")

# specify corpus version
mp_use_corpus_version("2016-1")


### FUNCTIONS

## Remove separator residue from the end of each string.
##
## text: character vector to clean.
##
## Two substitutions are applied, in this order:
##   1. a trailing run of whitespace, "|" and the two private-use characters
##      U+F02F / U+F020 (built from code points 61487 and 61472) is deleted;
##   2. any "/" characters left at the very end are deleted.
## Returns the cleaned character vector.
clean_separators <- function(text) {
  separator_chars <- intToUtf8(c(61487, 61472))
  trailing_separators <- paste0("\\s*([", separator_chars, "\\|]|\\s)*\\s*$")

  without_separators <- str_replace_all(text, trailing_separators, "")
  str_replace_all(without_separators, "(.*?)/+$", "\\1")
}

## Normalise text before comparing it.
##
## x: character vector.
##
## Applies, in order: clean_separators(), removePunctuation(), tolower() and
## stripWhitespace(), and returns the result.
## NOTE(review): removePunctuation() and stripWhitespace() are not defined in
## this file; presumably they come from the tm package attached through
## manifestoR -- confirm.
clean <- function(x) {
  trimmed <- clean_separators(x)
  unpunctuated <- removePunctuation(trimmed)
  stripWhitespace(tolower(unpunctuated))
}

## Split text into natural sentences with an openNLP sentence annotator.
##
## text: text to split; it is converted with as.String() first.
## lang: language passed to Maxent_Sent_Token_Annotator() (default "de").
##
## Returns the pieces of the text selected by the sentence annotations.
convert_text_to_sentences <- function(text, lang = "de") {
  annotator <- Maxent_Sent_Token_Annotator(language = lang)

  # annotate() and the subsetting below operate on NLP's String class
  doc <- as.String(text)
  sentence_spans <- NLP::annotate(doc, annotator)

  doc[sentence_spans]
}


## Evaluate which quasi-sentences of a document were already included in a draft.
##
## prog:       document whose text is read with content() and whose length()
##             gives the number of quasi-sentences.
## draft_path: path of the text file containing the draft.
##
## The draft is split into sentences; draft sentences and quasi-sentences are
## both normalised with clean(). Every quasi-sentence is then matched
## approximately (agrep() with its default settings) against all draft
## sentences.
##
## Returns `prog` with three added code layers:
##   in_draft            TRUE if at least one draft sentence matched
##   positions_in_draft  indices of the matching draft sentences, joined by ";"
##   text_in_draft       the matching (cleaned) draft sentences, joined by ";"
add_draftinfo <- function(prog, draft_path) {

   draft <- readLines(draft_path) %>% convert_text_to_sentences() %>% clean()
   cleaned <- content(prog) %>% clean()

   # seq_len() yields an empty sequence for an empty document (1:0 would not)
   sentence_ids <- seq_len(length(prog))

   # mclapply() relies on forking and rejects mc.cores > 1 on Windows; with a
   # single core it simply runs sequentially, so no manual edit is needed there.
   n_cores <- if (.Platform$OS.type == "windows") 1L else 3L

   position_in_draft <- mclapply(sentence_ids, function(x) {
      pattern <- cleaned[x]
      # agrep() only accepts a non-empty pattern; a quasi-sentence with no text
      # left after cleaning cannot occur in the draft, so report "no match".
      if (is.na(pattern) || !nzchar(pattern)) {
         return(integer(0))
      }
      agrep(pattern, draft)
   }, mc.cores = n_cores)

   # Forked workers hand errors back as "try-error" objects (or NULL if the
   # worker died) instead of raising them; stop here rather than storing them.
   failed <- vapply(position_in_draft,
     function(pos) is.null(pos) || inherits(pos, "try-error"), logical(1))
   if (any(failed)) {
      stop("matching against the draft failed for ", sum(failed),
           " quasi-sentence(s)", call. = FALSE)
   }

   # The remaining steps are cheap, so they run in the main process.

   # binary: in draft or not
   codes(prog, layer="in_draft") <- lengths(position_in_draft) > 0

   ## positions in draft (separated by ;)
   codes(prog, layer="positions_in_draft") <- vapply(position_in_draft,
     function(pos) paste(as.character(pos), collapse=";"), character(1))

   # text from draft (separated by ;)
   codes(prog, layer="text_in_draft") <- vapply(position_in_draft,
     function(pos) paste(draft[pos], collapse=";"), character(1))

   return(prog)
}

## Write a strip plot of `data` to a PDF file.
##
## data:     values handed to plot(); each one is drawn as a vertical line
##           (type = "h") at its index position.
## filename: path of the PDF file to create (3 x 1.2 inches).
## title:    main title of the plot.
## alpha:    opacity multiplier; the lines are drawn in mid-grey with an
##           opacity of 0.1 * alpha.
##
## Returns the value of dev.off() for the PDF device.
create_densityplot <- function(data, filename, title, alpha) {
   density_plot_color <- rgb(0.5, 0.5, 0.5, 0.1*alpha)

   pdf(file=filename, width=3, height=1.2)
   pdf_device <- dev.cur()
   # Close the device even if plotting fails, so a failed call does not leave
   # an open PDF device (and file handle) behind.
   device_open <- TRUE
   on.exit(if (device_open) dev.off(pdf_device), add = TRUE)

   # par() is called after pdf(), so it only affects the new device
   par(mar=c(1,1,1,1), tcl=-0.4)
   plot(
      data,
      type = "h",
      col = density_plot_color,
      ann = FALSE,
      yaxt = "n",
      xaxs="i",
      yaxs="i",
      xaxt = "n",
      frame.plot=FALSE
   )
   # NOTE(review): the axis labels are the literal strings "xlab" and "ylab",
   # which look like placeholders; kept as-is so the output does not change.
   title(title,
         xlab = "xlab",
         ylab = "ylab"
         )

   device_open <- FALSE
   dev.off(pdf_device)
}

## Back-to-back bar chart of code frequencies, saved to a file.
##
## data:        data frame with the columns `code` and `draft`.
## filename:    path of the file written by ggsave() (3.5 x 2 inches).
## filtercodes: codes to keep; all other rows are dropped.
## graphtitle:  plot title.
## legend:      accepted for compatibility but not used -- the fill legend is
##              always hidden. NOTE(review): honouring it would add a legend
##              to existing calls that pass TRUE, so it is left untouched.
##
## Rows with draft == 0 are counted as positive bars, rows with draft == 1 as
## negative bars; the count axis shows absolute values from 0 to 300 and is
## limited to [-360, 360].
## NOTE(review): the four category labels are fixed and assigned by position,
## so this presumably expects exactly the codes 106, 501, 503 and 504 to be
## present -- confirm before using other `filtercodes`.
create_comparison_plot <- function(data,filename,filtercodes,graphtitle,legend) {
   data <- data %>% dplyr::filter(code %in% filtercodes)
   # `..count..` is kept (rather than after_stat()) so the script still runs
   # with the ggplot2 versions it was originally written for.
   r <- ggplot(data=data,aes(x=as.factor(code),fill=(draft==1))) + 
      geom_bar(data=data %>% dplyr::filter(draft==0)) + 
      geom_bar(data=data %>% dplyr::filter(draft==1),aes(y=..count..*(-1))) + 
      scale_y_continuous(breaks=seq(-300,300,100),labels=abs(seq(-300,300,100)), limits=c(-360,360)) + 
      scale_x_discrete(labels=c("peace", "environ't", "equality", "welfare")) +
      ggtitle(graphtitle) +
      xlab("") +
      ylab("") +
      coord_flip() +
      # "none" replaces the deprecated guides(fill=FALSE)
      guides(fill="none")

   # name the arguments in full instead of relying on `file` partially
   # matching `filename`
   ggsave(filename=filename, plot=r, width=3.5, height=2)
}


### SCRIPT PART

# greens: compare the coded manifesto (party 41113, date 201309) with the
# draft in greens.txt and store code + in_draft flag per quasi-sentence
gruene_entwurf <- "greens.txt"
gruene <- mp_corpus(party == 41113 & date == 201309)[[1]]
gruene_compared <- add_draftinfo(gruene, gruene_entwurf)
df_gruene <- data.frame(
  code = codes(gruene_compared, layer = "cmp_code"),
  draft = codes(gruene_compared, layer = "in_draft")
)
write.csv(df_gruene, file = "greens.csv")

# spd: compare the coded manifesto (party 41320, date 201309) with the
# draft in spd.txt and store code + in_draft flag per quasi-sentence
spd_entwurf <- "spd.txt"
spd <- mp_corpus(party == 41320 & date == 201309)[[1]]
spd_compared <- add_draftinfo(spd, spd_entwurf)
df_spd <- data.frame(
  code = codes(spd_compared, layer = "cmp_code"),
  draft = codes(spd_compared, layer = "in_draft")
)
write.csv(df_spd, file = "spd.csv")

# figures: work from the CSV files written above
df_gruene <- read.csv(file = "greens.csv")
df_spd <- read.csv(file = "spd.csv")

# The two documents differ in length. Scale the opacity of the Green plot by
# the ratio of row counts to avoid a biased visualization due to overplotting.
alpha_gruene <- nrow(df_spd) / nrow(df_gruene)

# density plots: one line per quasi-sentence that is NOT in the draft
create_densityplot(
  data = !df_gruene$draft,
  filename = "gruene_density.pdf",
  title = "Green Party Manifesto 2013",
  alpha = alpha_gruene
)
create_densityplot(
  data = !df_spd$draft,
  filename = "spd_density.pdf",
  title = "SPD Manifesto 2013",
  alpha = 1
)

# comparison plots for the codes 106, 501, 503 and 504
create_comparison_plot(
  data = df_spd,
  filename = "spd_comparison.pdf",
  filtercodes = c(106, 501, 503, 504),
  graphtitle = "SPD Manifesto 2013",
  legend = TRUE
)
create_comparison_plot(
  data = df_gruene,
  filename = "gruene_comparison.pdf",
  filtercodes = c(106, 501, 503, 504),
  graphtitle = "Green Party Manifesto 2013",
  legend = FALSE
)
